<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 4.2.0">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="/lib/font-awesome/css/font-awesome.min.css">
  <link rel="stylesheet" href="//cdn.jsdelivr.net/gh/fancyapps/fancybox@3/dist/jquery.fancybox.min.css">
  <link rel="stylesheet" href="/lib/pace/pace-theme-minimal.min.css">
  <script src="/lib/pace/pace.min.js"></script>

<script id="hexo-configurations">
    // Theme namespace: reuse window.NexT when it already exists, otherwise start from an empty object.
    var NexT = window.NexT || {};
    // Site-wide settings serialized as a single object literal. Other scripts on this page read
    // fields from it, e.g. CONFIG.hostname, CONFIG.comments, CONFIG.motion and CONFIG.bookmark.
    var CONFIG = {"hostname":"lanqilu.github.io","root":"/","scheme":"Gemini","version":"7.8.0","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12,"onmobile":true},"copycode":{"enable":true,"show_result":true,"style":null},"back2top":{"enable":true,"sidebar":true,"scrollpercent":true},"bookmark":{"enable":false,"color":"#395ca3","save":"auto"},"fancybox":true,"mediumzoom":false,"lazyload":false,"pangu":true,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"algolia":{"hits":{"per_page":10},"labels":{"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}},"localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false},"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},"path":"search.xml"};
  </script>

  <meta name="description" content="Python网络数据采集笔记">
<meta property="og:type" content="article">
<meta property="og:title" content="Python网络数据采集">
<meta property="og:url" content="https://lanqilu.github.io/2020/03/24/Python/Python%E7%BD%91%E7%BB%9C%E6%95%B0%E6%8D%AE%E9%87%87%E9%9B%86/index.html">
<meta property="og:site_name" content="Halo">
<meta property="og:description" content="Python网络数据采集笔记">
<meta property="og:locale" content="zh_CN">
<meta property="article:published_time" content="2020-03-24T08:40:49.000Z">
<meta property="article:modified_time" content="2020-03-27T14:36:44.728Z">
<meta property="article:author" content="Lanqilu">
<meta property="article:tag" content="Python">
<meta property="article:tag" content="爬虫">
<meta name="twitter:card" content="summary">

<link rel="canonical" href="https://lanqilu.github.io/2020/03/24/Python/Python%E7%BD%91%E7%BB%9C%E6%95%B0%E6%8D%AE%E9%87%87%E9%9B%86/">


<script id="page-configurations">
  // Page-level settings attached to the global CONFIG object.
  // Reference for the underlying variables: https://hexo.io/docs/variables.html
  CONFIG.page = { sidebar: "", isHome: false, isPost: true, lang: 'zh-CN' };
</script>

  <title>Python网络数据采集 | Halo</title>
  
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-161500949-1"></script>
    <script data-pjax>
      // Report to Google Analytics only when the page is served from the configured hostname.
      if (location.hostname === CONFIG.hostname) {
        // Reuse an existing queue if one is already present on window.
        window.dataLayer = window.dataLayer || [];
        // Pushes the `arguments` object itself onto the queue (not a copied array).
        function gtag() {
          dataLayer.push(arguments);
        }
        gtag('js', new Date());
        gtag('config', 'UA-161500949-1');
      }
    </script>


  <script data-pjax>
    // Global array kept if already defined; presumably the queue hm.js reads (TODO confirm).
    var _hmt = _hmt || [];
    // Inject the hm.baidu.com script ahead of the first <script> element in the document.
    (function() {
      var firstScript = document.getElementsByTagName("script")[0];
      var tracker = document.createElement("script");
      tracker.src = "https://hm.baidu.com/hm.js?256151d1651e9d73ec980b2fc69de8f6";
      firstScript.parentNode.insertBefore(tracker, firstScript);
    })();
  </script>




  <noscript>
  <style>
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header { opacity: initial; }

  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container use-motion">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <span class="logo-line-before"><i></i></span>
      <h1 class="site-title">Halo</h1>
      <span class="logo-line-after"><i></i></span>
    </a>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>




<nav class="site-nav">
  <ul id="menu" class="menu">
        <li class="menu-item menu-item-home">

    <a href="/" rel="section"><i class="fa fa-fw fa-home"></i>首页</a>

  </li>
        <li class="menu-item menu-item-tags">

    <a href="/tags/" rel="section"><i class="fa fa-fw fa-tags"></i>标签<span class="badge">39</span></a>

  </li>
        <li class="menu-item menu-item-categories">

    <a href="/categories/" rel="section"><i class="fa fa-fw fa-th"></i>分类<span class="badge">18</span></a>

  </li>
        <li class="menu-item menu-item-archives">

    <a href="/archives/" rel="section"><i class="fa fa-fw fa-archive"></i>归档<span class="badge">85</span></a>

  </li>
        <li class="menu-item menu-item-python">

    <a href="/categories/Python/" rel="section"><i class="fa fa-fw fa-code"></i>Python</a>

  </li>
        <li class="menu-item menu-item-java">

    <a href="/categories/Java/" rel="section"><i class="fa fa-fw fa-code"></i>Java</a>

  </li>
      <li class="menu-item menu-item-search">
        <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
        </a>
      </li>
  </ul>
</nav>



  <div class="search-pop-overlay">
    <div class="popup search-popup">
        <div class="search-header">
  <span class="search-icon">
    <i class="fa fa-search"></i>
  </span>
  <div class="search-input-container">
    <input autocomplete="off" autocapitalize="off"
           placeholder="搜索..." spellcheck="false"
           type="search" class="search-input">
  </div>
  <span class="popup-btn-close">
    <i class="fa fa-times-circle"></i>
  </span>
</div>
<div id="search-result">
  <div id="no-result">
    <i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>
  </div>
</div>

    </div>
  </div>

</div>
    </header>

    
  <div class="reading-progress-bar"></div>


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content post posts-expand">
            

    
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="https://lanqilu.github.io/2020/03/24/Python/Python%E7%BD%91%E7%BB%9C%E6%95%B0%E6%8D%AE%E9%87%87%E9%9B%86/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="http://img.whl123456.top/image/avatar.jpg">
      <meta itemprop="name" content="Lanqilu">
      <meta itemprop="description" content="">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Halo">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          Python网络数据采集
        </h1>

        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              <span class="post-meta-item-text">发表于</span>

              <time title="创建时间：2020-03-24 16:40:49" itemprop="dateCreated datePublished" datetime="2020-03-24T16:40:49+08:00">2020-03-24</time>
            </span>
              <span class="post-meta-item">
                <span class="post-meta-item-icon">
                  <i class="fa fa-calendar-check-o"></i>
                </span>
                <span class="post-meta-item-text">更新于</span>
                <time title="修改时间：2020-03-27 22:36:44" itemprop="dateModified" datetime="2020-03-27T22:36:44+08:00">2020-03-27</time>
              </span>
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              <span class="post-meta-item-text">分类于</span>
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/Python/" itemprop="url" rel="index"><span itemprop="name">Python</span></a>
                </span>
            </span>

          

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">

      
        <div class="note info no-icon">
            <p>Python网络数据采集笔记</p>
          </div>

<a id="more"></a>

<hr>
<h2 id="网络数据采集的基本原理"><a href="#网络数据采集的基本原理" class="headerlink" title="网络数据采集的基本原理"></a>网络数据采集的基本原理</h2><p>对网络爬虫通常的想法：</p>
<ul>
<li>通过网站域名获取 HTML 数据</li>
<li>根据目标信息解析数据</li>
<li>存储目标信息</li>
<li>如果有必要，移动到另一个网页重复这个过程</li>
</ul>
<h2 id="Python实现网络连接"><a href="#Python实现网络连接" class="headerlink" title="Python实现网络连接"></a>Python实现网络连接</h2><p>使用python获取网页，代码实现</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> urllib.request <span class="keyword">import</span> urlopen</span><br><span class="line">html = urlopen(<span class="string">"http://www.baidu.com"</span>)</span><br><span class="line">print(html.read())</span><br></pre></td></tr></table></figure>

<h3 id="urllib"><a href="#urllib" class="headerlink" title="urllib"></a>urllib</h3><p>urllib 是 Python 的标准库，包含了从网络请求数据，处理cookie，甚至改变像请求头和用户代理这些元数据的函数。</p>
<p>urlopen 用来打开并读取一个从网络获取的远程对象。</p>
<blockquote>
<p>Python 2.x 里用的是 urllib2 库，在 Python 3.x 里，urllib2 改名为 urllib，并被拆分成 urllib.request、urllib.error 等子模块</p>
</blockquote>
<h3 id="BeautifulSoup"><a href="#BeautifulSoup" class="headerlink" title="BeautifulSoup"></a>BeautifulSoup</h3><p>BeautifulSoup通过定位 HTML 标签来 格式化和组织复杂的网络信息，用简单易用的 Python 对象为我们展现 XML 结构信息。</p>
<p>PyCharm 可以自动导入该库；其他安装方式可参考该<a href="https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/#id8" target="_blank" rel="noopener">文档</a>，有关Python虚拟环境可参考该<a href="https://lanqilu.github.io/2019/10/12/Python/配置虚拟环境venv/">文章</a>。</p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install beautifulsoup4</span><br></pre></td></tr></table></figure>

<p>使用BeautifulSoup</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> urllib.request <span class="keyword">import</span> urlopen</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line">html = urlopen(<span class="string">"https://lanqilu.github.io/"</span>)</span><br><span class="line">bsObj = BeautifulSoup(html.read())</span><br><span class="line">print(bsObj.h1)</span><br></pre></td></tr></table></figure>

<h3 id="网络连接异常"><a href="#网络连接异常" class="headerlink" title="网络连接异常"></a>网络连接异常</h3><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">html = urlopen(<span class="string">"https://lanqilu.github.io/"</span>)</span><br></pre></td></tr></table></figure>

<p>可能发生的异常有两种</p>
<ul>
<li>网页在服务器上不存在（或者获取页面的时候出现错误）</li>
<li>服务器不存在</li>
</ul>
<p>第一种异常发生时，程序会返回 HTTP 错误。可以查看<a href="https://lanqilu.github.io/2020/03/21/Web/HTTP状态码/">HTTP状态码</a>。所有类似情形，urlopen 函数都会抛出“HTTPError”异常</p>
<p>处理方式</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> urllib.error <span class="keyword">import</span> HTTPError</span><br><span class="line"><span class="keyword">from</span> urllib.request <span class="keyword">import</span> urlopen</span><br><span class="line"></span><br><span class="line"><span class="keyword">try</span>:</span><br><span class="line">    html = urlopen(<span class="string">"https://lanqilu.github.io/2018"</span>) <span class="comment"># 异常网站 </span></span><br><span class="line"><span class="keyword">except</span> HTTPError <span class="keyword">as</span> e:</span><br><span class="line">    print(e)</span><br><span class="line">    <span class="comment"># 返回空值，中断程序，或者执行另一个方案</span></span><br><span class="line"><span class="keyword">else</span>:</span><br><span class="line">    print(<span class="string">"程序继续"</span>)</span><br><span class="line"><span class="comment"># 注意：如果你已经在上面异常捕捉那一段代码里返回或中断（break）</span></span><br><span class="line"><span class="comment"># 那么就不需要使用else语句了，这段代码也不会执行</span></span><br></pre></td></tr></table></figure>

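<p>如果服务器不存在（URL链接打不开），urlopen 不会返回 None，而是抛出 URLError 异常（HTTPError 是 URLError 的子类），需要另外捕获 <code>urllib.error.URLError</code>。</p>
<p>如果要调用的标签不存在，BeautifulSoup 就会返回 None 对象；若继续在 None 上访问子标签，则会抛出 AttributeError。下面的代码同时处理了这两种情况：</p>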
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> urllib.request <span class="keyword">import</span> urlopen</span><br><span class="line"><span class="keyword">from</span> urllib.error <span class="keyword">import</span> HTTPError</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">getTitle</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="keyword">try</span>:</span><br><span class="line">        html = urlopen(url)</span><br><span class="line">    <span class="keyword">except</span> HTTPError <span class="keyword">as</span> e:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">    <span class="keyword">try</span>:</span><br><span class="line">        bsObj = BeautifulSoup(html.read(), <span class="string">"lxml"</span>)</span><br><span class="line">        title = bsObj.body.h1</span><br><span class="line">    <span class="keyword">except</span> AttributeError <span 
class="keyword">as</span> e:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">None</span></span><br><span class="line">    <span class="keyword">return</span> title</span><br><span class="line"></span><br><span class="line">title = getTitle(<span class="string">"https://www.runoob.com/"</span>)</span><br><span class="line"><span class="keyword">if</span> title <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">    print(<span class="string">"Title could not be found"</span>)</span><br><span class="line"><span class="keyword">else</span>:</span><br><span class="line">    print(title)</span><br></pre></td></tr></table></figure>

<h2 id="HTML解析"><a href="#HTML解析" class="headerlink" title="HTML解析"></a>HTML解析</h2><p>属性查找标签的方法，标签组的使用，以及标签解析树的导航过程</p>
<p>网络爬虫可以通过 class 属性的值，轻松地区分出两种不同的标签。</p>
<p>例如：</p>
<figure class="highlight html"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">"<span class="tag">&lt;<span class="name">span</span> <span class="attr">class</span>=<span class="string">"red"</span>&gt;</span>Heavens! what a virulent attack!<span class="tag">&lt;/<span class="name">span</span>&gt;</span>" replied <span class="tag">&lt;<span class="name">span</span> <span class="attr">class</span>=</span></span><br><span class="line"><span class="tag">"<span class="attr">green</span>"&gt;</span>the prince<span class="tag">&lt;/<span class="name">span</span>&gt;</span>, not in the least disconcerted by this reception.</span><br></pre></td></tr></table></figure>

<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> urllib.request <span class="keyword">import</span> urlopen</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"><span class="comment">#%%</span></span><br><span class="line"><span class="comment"># 爬取页面</span></span><br><span class="line">html = urlopen(<span class="string">"http://www.pythonscraping.com/pages/warandpeace.html"</span>)</span><br><span class="line">bsObj = BeautifulSoup(html, features=<span class="string">"lxml"</span>) <span class="comment"># features 选择解析器</span></span><br><span class="line"><span class="comment"># 以用 findAll 函数抽取只包含在 &lt;span class="green"&gt;&lt;/span&gt; 标签里的文字</span></span><br><span class="line">nameList = bsObj.findAll(<span class="string">"span"</span>, &#123;<span class="string">"class"</span>: <span class="string">"green"</span>&#125;)</span><br><span class="line"><span class="comment"># 打印输出</span></span><br><span class="line"><span class="keyword">for</span> name <span class="keyword">in</span> nameList:</span><br><span class="line">    print(name.get_text())</span><br></pre></td></tr></table></figure>

<blockquote>
<p><code>get_text()</code></p>
<p><code>.get_text()</code>会把正在处理的 HTML 文档中所有的标签都清除，然后返回 一个只包含文字的字符串。假如你正在处理一个包含许多超链接、段落和标签的大段源代码，那么<code>.get_text()</code> 会把这些超链接、段落和标签都清除掉， 只剩下一串不带标签的文字。</p>
</blockquote>
<h3 id="BeautifulSoup的find-和findAll"><a href="#BeautifulSoup的find-和findAll" class="headerlink" title="BeautifulSoup的find()和findAll()"></a>BeautifulSoup的<code>find()</code>和<code>findAll()</code></h3><p><code>findAll</code>函数通过标签的名称和属性来查找标签</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">findAll(tag, attributes, recursive, text, limit, keywords)</span><br><span class="line">find(tag, attributes, recursive, text, keywords)</span><br></pre></td></tr></table></figure>

<p><code>tag</code>标签参数，可以传一个标签的名称或多个标签名称组成的 Python 列表做标签参数</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">.findAll(&#123;<span class="string">"h1"</span>,<span class="string">"h2"</span>,<span class="string">"h3"</span>,<span class="string">"h4"</span>,<span class="string">"h5"</span>,<span class="string">"h6"</span>&#125;)</span><br></pre></td></tr></table></figure>

<p><code>attributes</code>属性参数，是用一个 Python 字典封装一个标签的若干属性和对应的属性值</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">.findAll(<span class="string">"span"</span>, &#123;<span class="string">"class"</span>:&#123;<span class="string">"green"</span>, <span class="string">"red"</span>&#125;&#125;)</span><br></pre></td></tr></table></figure>

<p><code>recursive</code>递归参数，是一个布尔变量，设置为True，查找标签参数的所有子标签，以及子标签的子标签；设置为 False，就只查找文档的一级标签。（默认值是 True）</p>
<p><code>text</code>文本参数，是用标签的文本内容去匹配，而不是用标签的属性。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">#%%</span></span><br><span class="line">html = urlopen(<span class="string">"http://www.pythonscraping.com/pages/warandpeace.html"</span>)</span><br><span class="line">bsObj = BeautifulSoup(html, features=<span class="string">"lxml"</span>)</span><br><span class="line">nameList = bsObj.findAll(text=<span class="string">"the prince"</span>)</span><br><span class="line">print(len(nameList))  <span class="comment"># 7</span></span><br></pre></td></tr></table></figure>

<p><code>limit</code>范围限制参数，只用于<code>findAll</code>方法，获得的前几项结果是按照网页上的顺序排序</p>
<blockquote>
<p><code>find</code>等价于<code>findAll</code>的<code>limit</code>等于1时的情形</p>
</blockquote>
<p><code>keywords</code>关键词参数，用于选择具有指定属性的标签（例如下面的 <code>id=&quot;text&quot;</code>）</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment">#%%</span></span><br><span class="line">html = urlopen(<span class="string">"http://www.pythonscraping.com/pages/warandpeace.html"</span>)</span><br><span class="line">bsObj = BeautifulSoup(html, features=<span class="string">"lxml"</span>)</span><br><span class="line">allText = bsObj.findAll(id=<span class="string">"text"</span>)</span><br><span class="line">print(allText[<span class="number">0</span>].get_text())</span><br></pre></td></tr></table></figure>

<blockquote>
<p><code>class</code>保留词问题</p>
<p><code>bsObj.findAll(class=&quot;green&quot;)</code>语法错误，解决方案<code>bsObj.findAll(class_=&quot;green&quot;)</code>或<code>bsObj.findAll(&quot;&quot;, {&quot;class&quot;:&quot;green&quot;})</code></p>
</blockquote>
<h3 id="其他BeautifulSoup对象"><a href="#其他BeautifulSoup对象" class="headerlink" title="其他BeautifulSoup对象"></a>其他BeautifulSoup对象</h3><ul>
<li><code>NavigableString</code>对象：用来表示标签里的文字，不是标签</li>
<li><code>Comment</code>对象：用来查找 HTML 文档的注释标签</li>
</ul>

    </div>

    
    
    
      
  <div class="popular-posts-header">相关文章</div>
  <ul class="popular-posts">
    <li class="popular-posts-item">
      <div class="popular-posts-title"><a href="/2020/03/23/Java/Java书单/" rel="bookmark">计算机书单</a></div>
    </li>
    <li class="popular-posts-item">
      <div class="popular-posts-title"><a href="/2020/04/06/Python/Django/" rel="bookmark">Django</a></div>
    </li>
    <li class="popular-posts-item">
      <div class="popular-posts-title"><a href="/2020/03/27/Python/正则表达式/" rel="bookmark">正则表达式</a></div>
    </li>
    <li class="popular-posts-item">
      <div class="popular-posts-title"><a href="/2019/10/01/Python/函数式编程/" rel="bookmark">函数式编程</a></div>
    </li>
    <li class="popular-posts-item">
      <div class="popular-posts-title"><a href="/2019/10/18/Python/面向对象/" rel="bookmark">面向对象</a></div>
    </li>
  </ul>


      <footer class="post-footer">
          
          <div class="post-tags">
              <a href="/tags/Python/" rel="tag"><i class="fa fa-tag"></i> Python</a>
              <a href="/tags/%E7%88%AC%E8%99%AB/" rel="tag"><i class="fa fa-tag"></i> 爬虫</a>
          </div>

        


        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/2020/03/24/%E7%BB%9F%E8%AE%A1%E5%AD%A6/%E7%94%9F%E7%89%A9%E7%BB%9F%E8%AE%A1%E5%AD%A6/" rel="prev" title="生物统计学">
      <i class="fa fa-chevron-left"></i> 生物统计学
    </a></div>
      <div class="post-nav-item">
    <a href="/2020/03/24/%E6%A4%8D%E7%89%A9%E7%94%9F%E7%90%86%E5%AD%A6/zwslx_2_%E6%A4%8D%E7%89%A9%E7%9A%84%E7%9F%BF%E7%89%A9%E8%B4%A8%E8%90%A5%E5%85%BB/" rel="next" title="植物的矿物质营养">
      植物的矿物质营养 <i class="fa fa-chevron-right"></i>
    </a></div>
    </div>
      </footer>
    
  </article>
  
  
  



          </div>
          

<script>
  // On 'tabs:register', click the comment tab whose href is "#comment-<name>", where <name>
  // comes from localStorage (when storage is enabled) or from CONFIG.comments.activeClass.
  // NOTE(review): the CONFIG.comments literal on this page has an `active` key but no
  // `activeClass`, so the config fallback is undefined here — confirm which key is intended.
  window.addEventListener('tabs:register', () => {
    const commentsConfig = CONFIG.comments;
    let preferred = commentsConfig.activeClass;
    if (commentsConfig.storage) {
      preferred = localStorage.getItem('comments_active') || preferred;
    }
    if (!preferred) return;
    const tabLink = document.querySelector(`a[href="#comment-${preferred}"]`);
    if (tabLink) {
      tabLink.click();
    }
  });

  // When storage is enabled, remember the second class name of the clicked comment pane
  // under the 'comments_active' key so the handler above can restore it later.
  if (CONFIG.comments.storage) {
    window.addEventListener('tabs:click', ({ target }) => {
      if (target.matches('.tabs-comment .tab-content .tab-pane')) {
        localStorage.setItem('comments_active', target.classList[1]);
      }
    });
  }
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#网络数据采集的基本原理"><span class="nav-number">1.</span> <span class="nav-text">网络数据采集的基本原理</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Python实现网络连接"><span class="nav-number">2.</span> <span class="nav-text">Python实现网络连接</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#urllib"><span class="nav-number">2.1.</span> <span class="nav-text">urllib</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#BeautifulSoup"><span class="nav-number">2.2.</span> <span class="nav-text">BeautifulSoup</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#网络连接异常"><span class="nav-number">2.3.</span> <span class="nav-text">网络连接异常</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#HTML解析"><span class="nav-number">3.</span> <span class="nav-text">HTML解析</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#BeautifulSoup的find-和findAll"><span class="nav-number">3.1.</span> <span class="nav-text">BeautifulSoup的find()和findAll()</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#其他BeautifulSoup对象"><span class="nav-number">3.2.</span> <span class="nav-text">其他BeautifulSoup对象</span></a></li></ol></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="Lanqilu"
      src="http://img.whl123456.top/image/avatar.jpg">
  <p class="site-author-name" itemprop="name">Lanqilu</p>
  <div class="site-description" itemprop="description"></div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">85</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/categories/">
          
        <span class="site-state-item-count">18</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/tags/">
          
        <span class="site-state-item-count">39</span>
        <span class="site-state-item-name">标签</span></a>
      </div>
  </nav>
</div>
  <div class="links-of-author motion-element">
      <span class="links-of-author-item">
        <a href="https://github.com/lanqilu" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;lanqilu" rel="noopener" target="_blank"><i class="fa fa-fw fa-github"></i></a>
      </span>
      <span class="links-of-author-item">
        <a href="http://mail.qq.com/cgi-bin/qm_share?t=qm_mailme&email=zqKvoL_noruOqKG2o6_nouCtoaM" title="E-Mail → http:&#x2F;&#x2F;mail.qq.com&#x2F;cgi-bin&#x2F;qm_share?t&#x3D;qm_mailme&amp;email&#x3D;zqKvoL_noruOqKG2o6_nouCtoaM" rel="noopener" target="_blank"><i class="fa fa-fw fa-envelope"></i></a>
      </span>
  </div>



      </div>
        <div class="back-to-top motion-element">
          <i class="fa fa-arrow-up"></i>
          <span>0%</span>
        </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

        

<div class="copyright">
  
  &copy; 2019 – 
  <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">Lanqilu</span>
</div>
  <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Gemini</a> 强力驱动
  </div>

        








      </div>
    </footer>
  </div>

  
  <script src="/lib/anime.min.js"></script>
  <script src="/lib/pjax/pjax.min.js"></script>
  <script src="//cdn.jsdelivr.net/npm/jquery@3/dist/jquery.min.js"></script>
  <script src="//cdn.jsdelivr.net/gh/fancyapps/fancybox@3/dist/jquery.fancybox.min.js"></script>
  <script src="//cdn.jsdelivr.net/npm/pangu@4/dist/browser/pangu.min.js"></script>
  <script src="/lib/velocity/velocity.min.js"></script>
  <script src="/lib/velocity/velocity.ui.min.js"></script>

<script src="/js/utils.js"></script>

<script src="/js/motion.js"></script>


<script src="/js/schemes/pisces.js"></script>


<script src="/js/next-boot.js"></script>

  <script>
// Create the global Pjax instance for this page.
var pjax = new Pjax({
  // CSS selectors handed to Pjax — presumably the regions it swaps on navigation (confirm
  // against the Pjax docs). '#page-configurations' and '#pjax' are also the containers whose
  // scripts the 'pjax:success' handler re-creates.
  selectors: [
    'head title',
    '#page-configurations',
    '.content-wrap',
    '.post-toc-wrap',
    '.languages',
    '#pjax'
  ],
  // Per-selector switch override: the table-of-contents wrapper uses Pjax.switches.innerHTML.
  switches: {
    '.post-toc-wrap': Pjax.switches.innerHTML
  },
  analytics: false,
  cacheBust: false,
  // Evaluates to true only when the bookmark feature is disabled in CONFIG.
  scrollTo : !CONFIG.bookmark.enable
});

// After each successful Pjax navigation: rebuild the page's pjax-related <script> elements,
// then refresh the theme, replay the motion sequence and update the sidebar position.
window.addEventListener('pjax:success', () => {
  // Builds a new <script> that carries over id, class, type, src, the data-pjax marker and
  // the inline code of `source`. Presumably needed because swapped-in scripts are not
  // executed on their own (confirm).
  const cloneScript = source => {
    const inlineCode = source.text || source.textContent || source.innerHTML || '';
    const fresh = document.createElement('script');
    if (source.id) fresh.id = source.id;
    if (source.className) fresh.className = source.className;
    if (source.type) fresh.type = source.type;
    if (source.src) {
      fresh.src = source.src;
      // Force synchronous loading of peripheral JS.
      fresh.async = false;
    }
    if (source.dataset.pjax !== undefined) fresh.dataset.pjax = '';
    if (inlineCode !== '') fresh.appendChild(document.createTextNode(inlineCode));
    return fresh;
  };

  const staleScripts = document.querySelectorAll('script[data-pjax], script#page-configurations, #pjax script');
  staleScripts.forEach(oldScript => {
    const host = oldScript.parentNode;
    const replacement = cloneScript(oldScript);
    // Remove first, then append: the new script ends up as the last child of its parent.
    host.removeChild(oldScript);
    host.appendChild(replacement);
  });

  NexT.boot.refresh();

  // Define Motion Sequence & Bootstrap Motion.
  if (CONFIG.motion.enable) {
    const { integrator, middleWares } = NexT.motion;
    integrator
      .init()
      .add(middleWares.subMenu)
      .add(middleWares.postList)
      .bootstrap();
  }

  NexT.utils.updateSidebarPosition();
});
</script>




  




  
<script src="/js/local-search.js"></script>













    <div id="pjax">
  

  

  

    </div>
</body>
</html>
